In this notebook we will cover an approach to do an exploratory data analysis with the data of the Brazilian general elections of 2018.
We will fetch the data from the official government repository available on the website of the TSE (Tribunal Superior Eleitoral).
require(rvest) # web scrapping
require(plotly) # data visualization
require(ggplot2) # data visualization
require(lubridate) # format dates
require(tidyverse) # data cleansing
# Base locations for the project. `wd` is the parent folder, `project` the
# project folder created inside it, and `raw_data` the name of the sub-folder
# used further down for the downloaded files.
wd <- '~/dev/r' # '/cloud/project'
project <- 'brz_elections'
raw_data <- 'raw_data'
dest_path <- file.path(wd, project)

# Create the project folder when it is missing. `recursive = TRUE` also
# creates `wd` itself if needed, and the result of dir.create() is checked so
# that "created" is only reported when the folder really was created.
if (dir.exists(dest_path)) {
  print(paste("Folder", dest_path , "already exists"))
} else if (dir.create(dest_path, recursive = TRUE)) {
  print(paste("Folder", dest_path , "created at", wd))
} else {
  stop("Could not create folder ", dest_path, call. = FALSE)
}
[1] "Folder ~/dev/r/brz_elections already exists"
# Point dest_path at the raw-data sub-folder of the project. The path is
# rebuilt from `wd` and `project` (rather than appended to the current value
# of dest_path) so that re-running this chunk does not produce
# ".../raw_data/raw_data".
dest_path <- file.path(wd, project, raw_data)

# Create the folder when it is missing and fail loudly if that is not
# possible, because every later step reads from or writes to it.
if (dir.exists(dest_path)) {
  print(paste("Folder ", dest_path , "already exists"))
} else if (dir.create(dest_path, recursive = TRUE)) {
  print(paste("Folder ", dest_path , "created sucessfully "))
} else {
  stop("Could not create folder ", dest_path, call. = FALSE)
}
[1] "Folder ~/dev/r/brz_elections/raw_data already exists"
# Switch the working directory to the raw-data folder.
# NOTE(review): when run inside a notebook chunk this change only lasts until
# the chunk finishes (see the message printed right after this call). The
# other code visible in this file builds its paths with
# file.path(dest_path, ...) and so does not appear to rely on it - confirm
# before removing.
setwd(dest_path)
The working directory was changed to /home/gaston/dev/r/brz_elections inside a notebook chunk. The working directory will be reset when the chunk is finished running. Use the knitr root.dir option in the setup chunk to change the working directory for notebook chunks.
# main page with the source data
target_url <- "http://www.tse.jus.br/hotsites/pesquisas-eleitorais/prestacao_contas_anos/2018.html"
# get html from source URL above
html <- read_html(target_url)
# use a css selector to keep only the anchors that point to the data files
nodes <- html_nodes(html, 'div p a')
# final fetch of the url addresses with the raw data
zip_url <- html_attr(nodes, 'href')

# *** WARNING ***
# some files are huge - >= 200 MB
#
# The files are downloaded one at a time instead of passing the whole vector
# to download.file(): a vector `url` is only accepted by some download
# methods, and a per-file call lets each result be checked.
# The "timeout" option (60 seconds by default) is raised for the duration of
# the downloads because it is often too short for files of this size; the
# previous value is restored afterwards, even if a download fails.
old_opts <- options(timeout = max(3600, getOption("timeout")))
tryCatch(
  for (i in seq_along(zip_url)) {
    status <- download.file(url = zip_url[i],
                            destfile = file.path(dest_path, basename(zip_url[i])),
                            mode = 'wb')
    # download.file() returns 0 on success
    if (status != 0) {
      stop("Download failed for ", zip_url[i], call. = FALSE)
    }
  },
  finally = options(old_opts)
)
trying URL 'http://agencia.tse.jus.br/estatistica/sead/odsele/prestacao_contas/prestacao_de_contas_eleitorais_orgaos_partidarios_2018.zip'
trying URL 'http://agencia.tse.jus.br/estatistica/sead/odsele/prestacao_contas/prestacao_de_contas_eleitorais_candidatos_2018.zip'
trying URL 'http://agencia.tse.jus.br/estatistica/sead/odsele/prestacao_contas/CNPJ_campanha_2018.zip'
# Unpack every downloaded archive into the raw-data folder, printing the
# path of each archive before it is extracted.
zip_files <- file.path(dest_path, basename(zip_url))
for (zip_file in zip_files) {
  print(zip_file)
  unzip(zipfile = zip_file, exdir = dest_path)
}
[1] "~/dev/r/brz_elections/raw_data/prestacao_de_contas_eleitorais_orgaos_partidarios_2018.zip"
error 1 in extracting from zip file
[1] "~/dev/r/brz_elections/raw_data/prestacao_de_contas_eleitorais_candidatos_2018.zip"
[1] "~/dev/r/brz_elections/raw_data/CNPJ_campanha_2018.zip"
error 1 in extracting from zip file
# delete redundant files that will not be used:
# everything in dest_path whose name does not contain "BRASIL".
# The pattern is matched as a fixed string; the former '*BRASIL*' was a glob,
# not a regular expression, and only matched by accident.
pattern <- 'BRASIL'
files_to_remove <- grep(pattern = pattern, x = dir(dest_path),
                        fixed = TRUE, invert = TRUE, value = TRUE)
# file.remove() cannot delete non-empty directories, so sub-folders are left
# alone instead of triggering a "Directory not empty" warning.
files_to_remove <- files_to_remove[!dir.exists(file.path(dest_path, files_to_remove))]
file.remove(file.path(dest_path, files_to_remove))
cannot remove file '~/dev/r/brz_elections/raw_data/a', reason 'Directory not empty'
[1] FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[20] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
# Read 'despesas_contratadas_candidatos_2018_BRASIL.csv' from the raw-data
# folder into `despesa_c`. Fields are separated by ';', decimals use ',',
# and input strings are declared as latin1.
hired_expenses_csv <- file.path(dest_path, 'despesas_contratadas_candidatos_2018_BRASIL.csv')
despesa_c <- read.csv(
  file = hired_expenses_csv,
  sep = ';',
  dec = ',',
  encoding = 'latin1'
)
# Names of the columns that are dropped from `despesa_c`.
rm_columns <- c(
  'NR_CPF_VICE_CANDIDATO',
  'SG_UF',
  'CD_ELEICAO',
  'DS_ELEICAO',
  'CD_TIPO_ELEICAO',
  'NM_TIPO_ELEICAO',
  'ANO_ELEICAO',
  'DT_GERACAO',
  'HH_GERACAO',
  'TP_PRESTACAO_CONTAS',
  'SQ_PRESTADOR_CONTAS',
  'NR_CPF_CANDIDATO',
  'NM_PARTIDO',
  'NR_PARTIDO',
  'DS_CNAE_FORNECEDOR',
  'CD_TIPO_FORNECEDOR',
  'CD_CNAE_FORNECEDOR',
  'DS_ESFERA_PART_FORNECEDOR',
  'CD_ESFERA_PART_FORNECEDOR',
  'SG_UF_FORNECEDOR',
  'CD_MUNICIPIO_FORNECEDOR',
  'NM_MUNICIPIO_FORNECEDOR',
  'SQ_CANDIDATO_FORNECEDOR',
  'NR_CANDIDATO_FORNECEDOR',
  'CD_CARGO_FORNECEDOR',
  'DS_ORIGEM_DESPESA',
  'CD_ORIGEM_DESPESA',
  'SQ_PARCELAMENTO_DESPESA'
)
# Assigning NULL to the selected columns removes them from the data frame.
despesa_c[rm_columns] <- NULL
# Persist the three data sets as .rds files in dest_path.
# NOTE(review): `receita` and `despesa` are not created in the visible part
# of this file - presumably they come from chunks not shown here; confirm.
saveRDS(
  object = receita,
  file = file.path(dest_path, 'clean_revenue.rds')
)
saveRDS(
  object = despesa,
  file = file.path(dest_path, 'clean_expenses_paid.rds')
)
saveRDS(
  object = despesa_c,
  file = file.path(dest_path, 'clean_expenses_hired.rds')
)